#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
const char* binary_subgroup_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define PI 3.141592653589f\n"
"__kernel void binary_buf_c4_c4_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int offset=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" \n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" \n"
" float4 out=OPERATOR;\n"
" \n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+offset);\n"
"}\n"
"__kernel void binary_buf_c4_c4_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" const int offset=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" float4 out=OPERATOR;\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+dst_offset);\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width)*16+(channel_idx % 4)*4;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
"__kernel void binary_buf_c4_c16_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int src_width=shape.z+input1_pad_left+input1_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset1=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" float4 out=OPERATOR;\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+offset0);\n"
"}\n"
"__kernel void binary_buf_c16_c4_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int src_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" const int offset1=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset0=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" float4 out=OPERATOR;\n"
" \n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+offset1);\n"
"}\n"
"__kernel void binary_buf_c4_c16_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int src_width=shape.z+input1_pad_left+input1_pad_right;\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset1=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left)*16+(channel_idx % 4)*4;\n"
" const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" float4 out=OPERATOR;\n"
" \n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+dst_offset);\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width)*16+(channel_idx % 4)*4;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
"__kernel void binary_buf_c16_c4_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int src_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" const int offset1=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset0=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16+(channel_idx % 4)*4;\n"
" const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0*isFull.x));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1*isFull.x));\n"
" if(isFull.x == 0) {\n"
" in0=(int4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(int4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" if(activationType == 1) {\n"
" out=out>0 ? out : 0;\n"
" }\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1*isFull.y));\n"
" if(isFull.x == 0) {\n"
" in0=(float4)(in0.x,in0.x,in0.x,in0.x);\n"
" }\n"
" if(isFull.y == 0) {\n"
" in1=(float4)(in1.x,in1.x,in1.x,in1.x);\n"
" }\n"
" float4 out=OPERATOR;\n"
" \n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+dst_offset);\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width)*16+(channel_idx % 4)*4;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
"__kernel void prelu_buf_c4_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right\n"
" ) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" \n"
" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset1=channel_idx*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1));\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1));\n"
" float4 out=OPERATOR;\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+offset0);\n"
"}\n"
"__kernel void prelu_buf_c4_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right\n"
" ) {\n"
" if (get_global_id(0) >= global_dim0 || get_global_id(1) >= global_dim1 || get_global_id(2) >= global_dim2) \n"
" return;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int w_idx=get_global_id(0) % shape.z;\n"
" const int h_idx=get_global_id(0)/shape.z;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_global_id(1);\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int channe_out_idx=channel_idx >> 2;\n"
" \n"
" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" const int offset1=channel_idx*4;\n"
" const int offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(vload4(0,input0+offset0));\n"
" int4 in1=convert_int4(vload4(0,input1+offset1));\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=convert_float4(vload4(0,input0+offset0));\n"
" float4 in1=convert_float4(vload4(0,input1+offset1));\n"
" float4 out=OPERATOR;\n"
" #endif\n"
" vstore4(CONVERT_OUTPUT4(out),0,output+offset);\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width)*16+(channel_idx % 4)*4;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore4((OUTPUT_TYPE4)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void prelu_buf_c16_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" const int channel16=(shape.w+15)/16;\n"
" const int width_pack=(shape.z+3)/4;\n"
" const int w_idx=(get_global_id(0) % width_pack) << 2;\n"
" const int h_idx=get_global_id(0)/width_pack;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_group_id(1);\n"
" const int sglid=get_sub_group_local_id();\n"
" const int src_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int offset0=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16;\n"
" const int offset1=channel_idx*16;\n"
" const int offset=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0))));\n"
" int4 in1=(int4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1+offset1))));\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0))));\n"
" float4 in1=(float4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1+offset1))));\n"
" \n"
" float4 out=OPERATOR;\n"
" #endif\n"
" {\n"
" if (w_idx+4>shape.z) {\n"
" for (int i=0; i<shape.z % 4; i++) {\n"
" output[offset+i*16+sglid]=(OUTPUT_TYPE)out[i];\n"
" }\n"
" }else{\n"
" INTEL_SUB_GROUP_WRITE4((__global INTEL_DATA*)(output+offset),AS_OUTPUT_DATA4(CONVERT_OUTPUT4(out)));\n"
" }\n"
" }\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*dst_width)*16+sglid;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*16]=(OUTPUT_TYPE)0;\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*16]=(OUTPUT_TYPE)0;\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void prelu_buf_c16_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C]\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" const int channel4=(shape.w+3)/4;\n"
" const int channel16=(shape.w+15)/16;\n"
" const int width_pack=(shape.z+3)/4;\n"
" const int w_idx=(get_global_id(0) % width_pack) << 2;\n"
" const int h_idx=get_global_id(0)/width_pack;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_group_id(1);\n"
" const int sglid=get_sub_group_local_id();\n"
" const int src_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int batch_width_height=shape.x*shape.z*shape.y*4;\n"
" const int offset0=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16;\n"
" const int offset1=channel_idx*16;\n"
" const int offset=(((batch_idx+(channel_idx<<2)*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0))));\n"
" int4 in1=(int4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1+offset1))));\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0))));\n"
" float4 in1=(float4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1+offset1))));\n"
" \n"
" float4 out=OPERATOR;\n"
" #endif\n"
" const int lid_x=sglid % 4;\n"
" const int lid_y=sglid/4;\n"
" int block_size=w_idx+4>shape.z ? (shape.z % 4) : 4;\n"
" for (int i=0; i<block_size; i++) {\n"
" output[offset+i*4+lid_y*batch_width_height+lid_x]=(OUTPUT_TYPE)out[i];\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void binary_buf_c16_c16_c16(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C4]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" const int channel16=(shape.w+15)/16;\n"
" const int width_pack=(shape.z+3)/4;\n"
" const int w_idx=(get_global_id(0) % width_pack) << 2;\n"
" const int h_idx=get_global_id(0)/width_pack;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_group_id(1);\n"
" const int sglid=get_sub_group_local_id();\n"
" const int src0_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int src1_width=shape.z+input1_pad_left+input1_pad_right;\n"
" const int dst_width=shape.z+output_pad_left+output_pad_right;\n"
" const int offset0=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src0_width+w_idx+input0_pad_left)*16;\n"
" const int offset1=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src1_width+w_idx+input1_pad_left)*16;\n"
" const int offset=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=isFull.x ? convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0)))) : (int4)(input0[0]);\n"
" int4 in1=isFull.y ? convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input1+offset1)))) : (int4)(input1[0]);\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=isFull.x ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0)))) : (float4)(input0[0]);\n"
" float4 in1=isFull.y ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input1+offset1)))) : (float4)(input1[0]);\n"
" \n"
" float4 out=OPERATOR;\n"
" #endif\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" {\n"
" if (w_idx+4>shape.z) {\n"
" for (int i=0; i<shape.z % 4; i++) {\n"
" output[offset+i*16+sglid]=(OUTPUT_TYPE)out[i];\n"
" }\n"
" }else{\n"
" INTEL_SUB_GROUP_WRITE4((__global INTEL_DATA*)(output+offset),AS_OUTPUT_DATA4(CONVERT_OUTPUT4(out)));\n"
" }\n"
" }\n"
" if(w_idx == 0){\n"
" int pad_offset=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*dst_width)*16+sglid;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*16]=(OUTPUT_TYPE)0;\n"
" }\n"
" pad_offset += (shape.z+output_pad_left)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*16]=(OUTPUT_TYPE)0;\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void binary_buf_c16_c16_c4(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
" __global INPUT_TYPE* input0,__global INPUT_TYPE* input1,__global OUTPUT_TYPE* output,\n"
" __private const int4 shape,//[N,H,W,C4]\n"
" __private const int2 isFull,\n"
" __private const int activationType,\n"
" __private const int input0_pad_left,__private const int input0_pad_right,\n"
" __private const int input1_pad_left,__private const int input1_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" const int channel16=(shape.w+15)/16;\n"
" const int channel4=(shape.w+3)/4;\n"
" const int width_pack=(shape.z+3)/4;\n"
" const int w_idx=(get_global_id(0) % width_pack) << 2;\n"
" const int h_idx=get_global_id(0)/width_pack;\n"
" const int batch_idx=get_global_id(2);\n"
" const int channel_idx=get_group_id(1);\n"
" const int sglid=get_sub_group_local_id();\n"
" const int src0_width=shape.z+input0_pad_left+input0_pad_right;\n"
" const int src1_width=shape.z+input1_pad_left+input1_pad_right;\n"
" const int batch_width_height=shape.x*shape.z*shape.y*4;\n"
" const int offset0=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src0_width+w_idx+input0_pad_left)*16;\n"
" const int offset1=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src1_width+w_idx+input1_pad_left)*16;\n"
" const int offset=(((batch_idx+(channel_idx << 2)*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0=isFull.x ? convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0)))) : (int4)(input0[0]);\n"
" int4 in1=isFull.y ? convert_int4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input1+offset1)))) : (int4)(input1[0]);\n"
" int4 out=in0 % in1;\n"
" out=((out<(int4)0 && in1>(int4)0) || (out>(int4)0 && in1<(int4)0)) ? out+in1 : out;\n"
" #else\n"
" float4 in0=isFull.x ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0)))) : (float4)(input0[0]);\n"
" float4 in1=isFull.y ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input1+offset1)))) : (float4)(input1[0]);\n"
" \n"
" float4 out=OPERATOR;\n"
" #endif\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" }\n"
" const int lid_x=sglid % 4;\n"
" const int lid_y=sglid/4;\n"
" int block_size=w_idx+4>shape.z ? (shape.z % 4) : 4;\n"
" for (int i=0; i<block_size; i++) {\n"
" output[offset+i*4+lid_y*batch_width_height+lid_x]=(OUTPUT_TYPE)out[i];\n"
" }\n"
"}\n"
;
#endif
#endif
}
